Linear Regression

This is a House Price Prediction Using Linear Regression Model

At first we install all the required packages for the linear regression.

# install.packages('readr', repos = "http://cran.us.r-project.org")
# install.packages('ggplot2', repos = "http://cran.us.r-project.org")
# install.packages('mlbench', repos = "http://cran.us.r-project.org")
# install.packages('corrplot', repos = "http://cran.us.r-project.org")
# install.packages('Amelia', repos = "http://cran.us.r-project.org")
# install.packages('caret', repos = "http://cran.us.r-project.org")
# install.packages('plotly', repos = "http://cran.us.r-project.org")
# install.packages('caTools', repos = "http://cran.us.r-project.org")
# install.packages('reshape2', repos = "http://cran.us.r-project.org")
# install.packages('dplyr', repos = "http://cran.us.r-project.org")
library(readr)
library(ggplot2)
library(corrplot)
library(mlbench)
library(Amelia)
library(plotly)
library(reshape2)
library(caret)
library(caTools)
library(dplyr)

We input the cleaned dataset

data(Housing)
housing <- Housing

Exploratory Data Analysis

Visualizations

Correlation

corrplot(cor(select(housing,-chas)))

Density Plot using ggplot2

housing %>% 
  ggplot(aes(medv)) +
  stat_density() + 
  theme_bw()

Density Plot using plotly

ggplotly(housing %>% 
  ggplot(aes(medv)) +
  stat_density() + 
  theme_bw())

medv

housing %>%
  select(c(crim, rm, age, rad, tax, lstat, medv,indus,nox,ptratio,zn)) %>%
  melt(id.vars = "medv") %>%
  ggplot(aes(x = value, y = medv, colour = variable)) +
  geom_point(alpha = 0.7) +
  stat_smooth(aes(colour = "black")) +
  facet_wrap(~variable, scales = "free", ncol = 2) +
  labs(x = "Variable Value", y = "Median House Price ($1000s)") +
  theme_minimal()
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  pseudoinverse used at -0.5
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  neighborhood radius 13
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  reciprocal condition number  4.5194e-15
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  There are other near singularities as well. 156.25
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)),  :
  pseudoinverse used at -0.5
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)),  :
  neighborhood radius 13
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)),  :
  reciprocal condition number  4.5194e-15
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)),  :
  There are other near singularities as well. 156.25

Model Building & Prediction

Train and Test Data

set.seed(123)
split <- sample.split(housing,SplitRatio =0.75)
train <- subset(housing,split==TRUE)
test <- subset(housing,split==FALSE)

Training The Model

model <- lm(medv ~ crim + rm + tax + age + nox + lstat , data = train)
summary(model)

Call:
lm(formula = medv ~ crim + rm + tax + age + nox + lstat, data = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-16.9490  -3.2287  -0.9126   2.2436  29.2666 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -4.090441   3.871134  -1.057   0.2914    
crim        -0.075304   0.036483  -2.064   0.0397 *  
rm           5.457783   0.513535  10.628  < 2e-16 ***
tax         -0.005835   0.002350  -2.483   0.0135 *  
age          0.014093   0.015452   0.912   0.3624    
nox          1.185187   3.840734   0.309   0.7578    
lstat       -0.538999   0.068160  -7.908 3.35e-14 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5.206 on 355 degrees of freedom
Multiple R-squared:  0.6769,    Adjusted R-squared:  0.6714 
F-statistic:   124 on 6 and 355 DF,  p-value: < 2.2e-16

Visualizing The Model

res <- residuals(model)
res <- as.data.frame(res)
ggplot(res,aes(res)) +  geom_histogram(fill='blue',alpha=0.5)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot(model)

Predictions

test$predicted.medv <- predict(model,test)

pl1 <-test %>% 
  ggplot(aes(medv,predicted.medv)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of medv') +
  ylab('Predicted value of medv')+
  theme_bw()

ggplotly(pl1)
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

Assessing our Model

error <- test$medv-test$predicted.medv
rmse <- sqrt(mean(error)^2)
LS0tCnRpdGxlOiAiQ1NFNDAyNyAtIEhvdXNlIFByaWNpbmciCm91dHB1dDoKICB3b3JkX2RvY3VtZW50OiBkZWZhdWx0CiAgcGRmX2RvY3VtZW50OiBkZWZhdWx0CiAgaHRtbF9ub3RlYm9vazogZGVmYXVsdAotLS0KCgojIExpbmVhciBSZWdyZXNzaW9uClRoaXMgaXMgYSBIb3VzZSBQcmljZSBQcmVkaWN0aW9uIFVzaW5nIExpbmVhciBSZWdyZXNzaW9uICBNb2RlbAoKQXQgZmlyc3Qgd2UgaW5zdGFsbCBhbGwgdGhlIHJlcXVpcmVkIHBhY2thZ2VzIGZvciB0aGUgbGluZWFyIHJlZ3Jlc3Npb24uCmBgYHtyfQojIGluc3RhbGwucGFja2FnZXMoJ3JlYWRyJywgcmVwb3MgPSAiaHR0cDovL2NyYW4udXMuci1wcm9qZWN0Lm9yZyIpCiMgaW5zdGFsbC5wYWNrYWdlcygnZ2dwbG90MicsIHJlcG9zID0gImh0dHA6Ly9jcmFuLnVzLnItcHJvamVjdC5vcmciKQojIGluc3RhbGwucGFja2FnZXMoJ21sYmVuY2gnLCByZXBvcyA9ICJodHRwOi8vY3Jhbi51cy5yLXByb2plY3Qub3JnIikKIyBpbnN0YWxsLnBhY2thZ2VzKCdjb3JycGxvdCcsIHJlcG9zID0gImh0dHA6Ly9jcmFuLnVzLnItcHJvamVjdC5vcmciKQojIGluc3RhbGwucGFja2FnZXMoJ0FtZWxpYScsIHJlcG9zID0gImh0dHA6Ly9jcmFuLnVzLnItcHJvamVjdC5vcmciKQojIGluc3RhbGwucGFja2FnZXMoJ2NhcmV0JywgcmVwb3MgPSAiaHR0cDovL2NyYW4udXMuci1wcm9qZWN0Lm9yZyIpCiMgaW5zdGFsbC5wYWNrYWdlcygncGxvdGx5JywgcmVwb3MgPSAiaHR0cDovL2NyYW4udXMuci1wcm9qZWN0Lm9yZyIpCiMgaW5zdGFsbC5wYWNrYWdlcygnY2FUb29scycsIHJlcG9zID0gImh0dHA6Ly9jcmFuLnVzLnItcHJvamVjdC5vcmciKQojIGluc3RhbGwucGFja2FnZXMoJ3Jlc2hhcGUyJywgcmVwb3MgPSAiaHR0cDovL2NyYW4udXMuci1wcm9qZWN0Lm9yZyIpCiMgaW5zdGFsbC5wYWNrYWdlcygnZHBseXInLCByZXBvcyA9ICJodHRwOi8vY3Jhbi51cy5yLXByb2plY3Qub3JnIikKbGlicmFyeShyZWFkcikKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KGNvcnJwbG90KQpsaWJyYXJ5KG1sYmVuY2gpCmxpYnJhcnkoQW1lbGlhKQpsaWJyYXJ5KHBsb3RseSkKbGlicmFyeShyZXNoYXBlMikKbGlicmFyeShjYXJldCkKbGlicmFyeShjYVRvb2xzKQpsaWJyYXJ5KGRwbHlyKQoKYGBgCgpXZSBpbnB1dCB0aGUgY2xlYW5lZCBkYXRhc2V0CmBgYHtyfQpkYXRhKEhvdXNpbmcpCmhvdXNpbmcgPC0gSG91c2luZwpgYGAKCiMjIEV4cGxvcmF0b3J5IERhdGEgQW5hbHlzaXMKIyMjIFZpc3VhbGl6YXRpb25zCgojIyMjIENvcnJlbGF0aW9uCgpgYGB7cn0KY29ycnBsb3QoY29yKHNlbGVjdChob3VzaW5nLC1jaGFzKSkpIApgYGAKCiMjIyMgRGVuc2l0eSBQbG90IHVzaW5nIGdncGxvdDIKYGBge3J9CmhvdXNpbmcgJT4lIAogIGdncGxvdChhZXMobWVkdikpICsKICBzdGF0X2RlbnNpdHkoKSArIAogIHRoZW1lX2J3KCkKYGBgCgojIyMjIERlbnNpdHkgUGxvdCB1c2luZyBwbG90bHkKYGBge3J9CmdncGxvdGx5KGhvdXNpbmcgJT4lIAogIGdncGxvdChhZXMobWVkdikpICsKICBzdGF0X2RlbnNpdHkoKSArIAogIHRoZW1lX2J3KCkpCmBgYAoKIyMjIyBtZWR2CmBgYHtyfQpob3VzaW5nICU+JQogIHNlbGVjdChjKGNyaW0sIHJtLCBhZ2UsIHJhZCwgdGF4LCBsc3RhdCwgbWVkdixpbmR1cyxub3gscHRyYXRpbyx6bikpICU+JQogIG1lbHQoaWQudmFycyA9ICJtZWR2IikgJT4lCiAgZ2dwbG90KGFlcyh4ID0gdmFsdWUsIHkgPSBtZWR2LCBjb2xvdXIgPSB2YXJpYWJsZSkpICsKICBnZW9tX3BvaW50KGFscGhhID0gMC43KSArCiAgc3RhdF9zbW9vdGgoYWVzKGNvbG91ciA9ICJibGFjayIpKSArCiAgZmFjZXRfd3JhcCh+dmFyaWFibGUsIHNjYWxlcyA9ICJmcmVlIiwgbmNvbCA9IDIpICsKICBsYWJzKHggPSAiVmFyaWFibGUgVmFsdWUiLCB5ID0gIk1lZGlhbiBIb3VzZSBQcmljZSAoJDEwMDBzKSIpICsKICB0aGVtZV9taW5pbWFsKCkKYGBgCgoKIyMgTW9kZWwgQnVpbGRpbmcgJiBQcmVkaWN0aW9uCiMjIyBUcmFpbiBhbmQgVGVzdCBEYXRhCmBgYHtyfQpzZXQuc2VlZCgxMjMpCnNwbGl0IDwtIHNhbXBsZS5zcGxpdChob3VzaW5nLFNwbGl0UmF0aW8gPTAuNzUpCnRyYWluIDwtIHN1YnNldChob3VzaW5nLHNwbGl0PT1UUlVFKQp0ZXN0IDwtIHN1YnNldChob3VzaW5nLHNwbGl0PT1GQUxTRSkKYGBgCgojIyMgVHJhaW5pbmcgVGhlIE1vZGVsCmBgYHtyfQptb2RlbCA8LSBsbShtZWR2IH4gY3JpbSArIHJtICsgdGF4ICsgYWdlICsgbm94ICsgbHN0YXQgLCBkYXRhID0gdHJhaW4pCnN1bW1hcnkobW9kZWwpCmBgYAojIyMgVmlzdWFsaXppbmcgVGhlIE1vZGVsCmBgYHtyfQpyZXMgPC0gcmVzaWR1YWxzKG1vZGVsKQpyZXMgPC0gYXMuZGF0YS5mcmFtZShyZXMpCmdncGxvdChyZXMsYWVzKHJlcykpICsgIGdlb21faGlzdG9ncmFtKGZpbGw9J2JsdWUnLGFscGhhPTAuNSkKYGBgCgpgYGB7cn0KcGxvdChtb2RlbCkKYGBgCgoKIyMjIFByZWRpY3Rpb25zCmBgYHtyfQp0ZXN0JHByZWRpY3RlZC5tZWR2IDwtIHByZWRpY3QobW9kZWwsdGVzdCkKCnBsMSA8LXRlc3QgJT4lIAogIGdncGxvdChhZXMobWVkdixwcmVkaWN0ZWQubWVkdikpICsKICBnZW9tX3BvaW50KGFscGhhPTAuNSkgKyAKICBzdGF0X3Ntb290aChhZXMoY29sb3VyPSdibGFjaycpKSArCiAgeGxhYignQWN0dWFsIHZhbHVlIG9mIG1lZHYnKSArCiAgeWxhYignUHJlZGljdGVkIHZhbHVlIG9mIG1lZHYnKSsKICB0aGVtZV9idygpCgpnZ3Bsb3RseShwbDEpCmBgYAoKIyMjIEFzc2Vzc2luZyBvdXIgTW9kZWwKYGBge3J9CmVycm9yIDwtIHRlc3QkbWVkdi10ZXN0JHByZWRpY3RlZC5tZWR2CnJtc2UgPC0gc3FydChtZWFuKGVycm9yKV4yKQpgYGAK